# Data analysis for JAA paper
Ben Marwick, 2013

This script contains R code used to generate several of the figures and tables
published in:

Marwick B 2013. Multiple Optima in Hoabinhian flaked stone artefact palaeoeconomics and palaeoecology at two archaeological sites in Northwest Thailand. Journal of Anthropological Archaeology 32(4), 553-564. http://dx.doi.org/10.1016/j.jaa.2013.08.004

Accompanying data files are available online at: 

Marwick, Ben (2013): Stone artefacts and human ecology at two rockshelters in Northwest Thailand. figshare. http://dx.doi.org/10.6084/m9.figshare.765252

The data were analysed in the following environment:
  
sessionInfo()
R version 3.0.2 (2013-09-25)
Platform: x86_64-w64-mingw32/x64 (64-bit)

locale:
  [1] LC_COLLATE=English_United States.1252  LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
  [1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
  [1] e1071_1.6-2     class_7.3-9     ggplot2_0.9.3.1

loaded via a namespace (and not attached):
  [1] colorspace_1.2-4   dichromat_2.0-0    digest_0.6.4       grid_3.0.2         gtable_0.1.2      
[6] labeling_0.2       MASS_7.3-29        munsell_0.4.2      plyr_1.8           proto_0.3-10      
[11] RColorBrewer_1.0-5 reshape2_1.2.2     scales_0.2.3       stringr_0.6.2      tools_3.0.2 






################### Tham Lod ##################################
# open data in R
TL <- read.csv("Tham_Lod_Area_1_lithics.csv")

# compute distribution of complete flake lengths
# subset complete flakes
TL_cf <- TL[TL$FLAKE_TYPE == 'Complete', ]
library(ggplot2)
ggplot(TL_cf, aes(x=PERC_LENTH)) + 
  geom_density()

# compute distribution of flake scar lengths on cores
# subset cores
TL_cor <- TL[TL$ARTEFACT_C == 'Core', ]
# get average lengths of last 5 flake scars

# get vector of col names to average over
tmp <- NULL
for(i in 1:5){
  tmp[i] <- paste0("FS",i,"_LENGTH")
}
# compute means for flake scars for each core
TL_cor$cor_scr_len_ave <- sapply(1:nrow(TL_cor), function(i) mean(as.numeric(TL_cor[i,tmp]), na.rm = TRUE))

# plot
ggplot(TL_cor, aes(x=cor_scr_len_ave)) + 
  geom_density()

# plot both flake lengths and core scar lengths
ggplot() + 
  geom_density(data=TL_cor, aes(x=cor_scr_len_ave),  size=0.75, linetype="dashed") +
  geom_density(data=TL_cf, aes(x=PERC_LENTH),  size=0.75) +
  xlab("millimeters") + 
  theme(panel.background = element_blank(),          # suppress default background
        panel.grid.major = element_blank(),              # suppress default major gridlines
        panel.grid.minor = element_blank(),              # suppress default minor gridlines
        axis.ticks = element_blank(),                    # suppress tick marks
        axis.title.x = element_text(size=13),              # increase axis title size slightly  
        axis.title.y = element_text(size=13, angle=90),    # increase axis title size slightly and rotate
        axis.text.x = element_text(size=13, colour = "black"),               # increase size of numbers on x-axis
        axis.text.y = element_text(size=13, colour = "black"))               # increase size of numbers on y-axis
##
ggsave('Marwick_Figure_2_PDF_TL.jpg', dpi = 600)


# statistical test of difference in distribution
# of the two lengths
TL_fl <- na.omit(TL_cor$cor_scr_len_ave)
TL_co <- na.omit(TL_cf$PERC_LENTH)
# t-test
t.test(TL_fl, TL_co)
# Bayesian estimation 
#library(BEST)
#BEST_out_TL <- BESTmcmc(TL_fl, TL_co)
#plot(BEST_out_TL)


################### Ban Rai ##################################
# open data in R
BR <- read.csv("Ban_Rai_Area_3_lithics.csv")

# compute distribution of complete flake lengths
# subset complete flakes
BR_cf <- BR[BR$FLAKE_TYPE == 'Complete', ]
library(ggplot2)
ggplot(BR_cf, aes(x=PERC_LENTH)) + 
  geom_density()

# compute distribution of flake scar lengths on cores
# subset cores
BR_cor <- BR[BR$ARTEFACT_C == 'Core', ]
# get average lengths of last 5 flake scars

# get vector of col names to average over
for(i in 1:5){
  tmp[i] <- paste0("FS",i,"_LENGTH")
}
# compute means for flake scars for each core
BR_cor$cor_scr_len_ave <- sapply(1:nrow(BR_cor), function(i) mean(as.numeric(BR_cor[i,tmp]), na.rm = TRUE))

# plot
ggplot(BR_cor, aes(x=cor_scr_len_ave)) + 
  geom_density()

# plot both flake lengths and core scar lengths
ggplot() + 
  geom_density(data=BR_cor, aes(x=cor_scr_len_ave),  size=0.75, linetype="dashed") +
  geom_density(data=BR_cf, aes(x=PERC_LENTH),  size=0.75) +
  xlab("millimeters") + 
  theme(panel.background = element_blank(),          # suppress default background
       panel.grid.major = element_blank(),              # suppress default major gridlines
       panel.grid.minor = element_blank(),              # suppress default minor gridlines
       axis.ticks = element_blank(),                    # suppress tick marks
       axis.title.x = element_text(size=13),              # increase axis title size slightly  
       axis.title.y = element_text(size=13, angle=90),    # increase axis title size slightly and rotate
       axis.text.x = element_text(size=13, colour = "black"),               # increase size of numbers on x-axis
       axis.text.y = element_text(size=13, colour = "black"))               # increase size of numbers on y-axis
##
ggsave('Marwick_Figure_3_PDF_BR.jpg', dpi = 600)

# statistical test of difference in distribution
# of the two lengths
BR_fl <- na.omit(BR_cor$cor_scr_len_ave)
BR_co <- na.omit(BR_cf$PERC_LENTH)
# t-test
t.test(BR_fl, BR_co)
# Bayesian estimation 
#library(BEST)
#BEST_out_BR <- BESTmcmc(BR_fl, BR_co)
#plot(BEST_out_BR)


################### 2008 experimental data ############################
# open data in R
EX <- read.csv("Marwick_2008_JAS_data.csv")

# compute distribution of complete flake lengths
# subset complete flakes
EX_cf <- EX[EX$Flake.Type == 'Complete', ]
library(ggplot2)
ggplot(EX_cf, aes(x=Perc.Lenth..mm.)) + 
  geom_density()

# compute distribution of flake scar lengths on cores
# subset cores
EX_cor <- EX[EX$Artefact.Class == 'Core', ]
# get average lengths of last 5 flake scars

# get vector of col names to average over
tmp <- NULL
for(i in 1:5){
  tmp[i] <- paste0("FS",i,".Length..mm.")
}
# compute means for flake scars for each core
EX_cor$cor_scr_len_ave <- sapply(1:nrow(EX_cor), function(i) mean(as.numeric(EX_cor[i,tmp]), na.rm = TRUE))

# plot
ggplot(EX_cor, aes(x=cor_scr_len_ave)) + 
  geom_density()

# plot both flake lengths and core scar lengths
ggplot() + 
  geom_density(data=EX_cor, aes(x=cor_scr_len_ave),  size=0.75, linetype="dashed") +
  geom_density(data=EX_cf, aes(x=Perc.Lenth..mm.),  size=0.75) +
  xlab("millimeters") +  
  theme(panel.background = element_blank(),          # suppress default background
        panel.grid.major = element_blank(),              # suppress default major gridlines
        panel.grid.minor = element_blank(),              # suppress default minor gridlines
        axis.ticks = element_blank(),                    # suppress tick marks
        axis.title.x = element_text(size=13),              # increase axis title size slightly  
        axis.title.y = element_text(size=13, angle=90),    # increase axis title size slightly and rotate
        axis.text.x = element_text(size=13, colour = "black"),               # increase size of numbers on x-axis
        axis.text.y = element_text(size=13, colour = "black"))               # increase size of numbers on y-axis
##
ggsave('Marwick_Figure_4_PDF_EX.jpg', dpi = 600)


# statistical test of difference in distribution
# of the two lengths
EX_fl <- na.omit(EX_cor$cor_scr_len_ave)
EX_co <- na.omit(EX_cf$Perc.Lenth..mm.)
# t-test
t.test(EX_fl, EX_co)
# Bayesian estimation 
#library(BEST)
#BEST_out_EX <- BESTmcmc(fl, co)
#plot(BEST_out_EX)
#plotAll(BEST_out_EX)

####################### compute skewness ######

library(e1071) 

# TL
skewness(TL_fl) ; skewness(TL_co) 

# BR
skewness(BR_fl) ; skewness(BR_co)

# EX
skewness(EX_fl) ; skewness(EX_co)

## significance test

# Create a single array of all the samples from both distributions.
# Randomly shuffle the array .
# Split the array at index N1.
# Calculate the abs() of difference of the skewness/kurtosis statistics for the two partitions.
# Repeat until desired precision/out of unique permutations.
# Then check if the actual abs() difference of skewness/kurtosis statistics lie beyond the p=0.05 interval.

# resampling test for difference in skew
skw <- c(TL_fl, BR_fl)
n = 100000
dif <- vector(length = n)
for(i in 1:n){
shf <- sample(skw)
spl1 <- shf[1:length(TL_fl)]
spl2 <- shf[length(TL_fl):length(shf)]
dif[i] <- abs(skewness(spl1) - skewness(spl2))
}
# plot
hist(dif)
v <-  abs(skewness(TL_fl) - skewness(BR_fl))
abline( v = v, col = "red", lwd = 2)

# compute a p-value from the resampled distribution
sum(dif[which(dif > v)]) / sum(dif)


####################### compute mean and sds ######

# TL
mean(TL_fl); mean(TL_co) 
sd(TL_fl); sd(TL_co)

# BR
mean(BR_fl); mean(BR_co)
sd(BR_fl); sd(BR_co)

# EX
mean(EX_fl); mean(EX_co)
sd(EX_fl); sd(EX_co)

####################### compute peaks values and differences ######

# TL
xTL_fl <- density(TL_fl)
xTL_fl <- xTL_fl$x[which.max(xTL_fl$y)] 

# area of curve RHS of peak
xTL_fl <- density(TL_fl)
xTL_fl_RHS <- sum(xTL_fl$x[which.max(xTL_fl$y):length(xTL_fl$x)]) 
# proportion of area to RHS of peak
xTL_fl_RHS / sum(xTL_fl$x)

xTL_co <- density(TL_co)
xTL_co <- xTL_co$x[which.max(xTL_co$y)]

# area of curve RHS of peak
xTL_co <- density(TL_co)
xTL_co_RHS <- sum(xTL_co$x[which.max(xTL_co$y):length(xTL_co$x)]) 
# proportion of area to RHS of peak
xTL_co_RHS / sum(xTL_co$x)


# BR
xBR_fl <- density(BR_fl)
xBR_fl <- xBR_fl$x[which.max(xBR_fl$y)] 
# area of curve RHS of peak
xBR_fl <- density(BR_fl)
xBR_fl_RHS <- sum(xBR_fl$x[which.max(xBR_fl$y):length(xBR_fl$x)]) 
# proportion of area to RHS of peak
xBR_fl_RHS / sum(xBR_fl$x)

xBR_co <- density(BR_co)
xBR_co <- xBR_co$x[which.max(xBR_co$y)]

# area of curve RHS of peak
xBR_co <- density(BR_co)
xBR_co_RHS <- sum(xBR_co$x[which.max(xBR_co$y):length(xBR_co$x)]) 
# proportion of area to RHS of peak
xBR_co_RHS / sum(xBR_co$x)

xBR_co - xBR_fl


# EX
xEX_fl <- density(EX_fl)
xEX_fl <- xEX_fl$x[which.max(xEX_fl$y)] 

# area of curve RHS of peak
xEX_fl<- density(EX_fl)
xEX_fl_RHS <- sum(xEX_fl$x[which.max(xEX_fl$y):length(xEX_fl$x)]) 
# proportion of area to RHS of peak
xEX_fl_RHS / sum(xEX_fl$x)

xEX_co <- density(EX_co)
xEX_co <- xEX_co$x[which.max(xEX_co$y)]

# area of curve RHS of peak
xEX_co<- density(EX_co)
xEX_co_RHS <- sum(xEX_co$x[which.max(xEX_co$y):length(xEX_co$x)]) 
# proportion of area to RHS of peak
xEX_co_RHS / sum(xEX_co$x)

xEX_co - xEX_fl


          
          